In [ ]:
## This notebook shows the trend of a single word in a single mailing list.
In [30]:
%matplotlib inline
In [31]:
from bigbang.archive import Archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
In [32]:
# Mailing lists to load. Only the first resulting archive (archives[0]) is
# scanned further down in this notebook.
# Other lists that were used here previously and can be swapped in:
#   http://mail.python.org/pipermail/ipython-dev/
#   http://mail.python.org/pipermail/ipython-user/
#   http://mail.python.org/pipermail/scipy-dev/
#   http://mail.python.org/pipermail/scipy-user/
#   http://mail.python.org/pipermail/numpy-discussion/
urls = ["6lo"]

# One Archive per entry, each created with archive_dir="../archives".
archives = [Archive(list_url, archive_dir="../archives") for list_url in urls]
In [33]:
# Word whose usage is tracked; change it to analyse a different word.
# It must be lower case: message bodies are lower-cased before tokenizing, and
# each token is stemmed before it is compared with this value.
checkword = "internet"
You'll need to download some resources for NLTK (the natural language toolkit) in order to do the kind of processing we want on all the mailing list text. In particular, for this notebook you'll need punkt, the Punkt Tokenizer Models.
To download, from an interactive Python shell, run:
import nltk
nltk.download()
And in the graphical UI that appears, choose "punkt" from the All Packages tab and Download.
In [29]:
# Scan every message of the first archive and record, per message, how many
# tokens match `checkword`. Messages with no match are left out of `df`.
#
# Result: `df` with one row per matching message and the columns
# MessageId, Date, From, In-Reply-To and Count.

# Tokens are stemmed before the comparison, so accept both the literal word
# (what the comparison used before) and its stem; otherwise a word whose stem
# differs from the word itself could never be counted.
targets = {checkword, st.stem(checkword)}

# Collect plain dicts and build the frame once at the end: growing a DataFrame
# row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
records = []
for message_id, message in archives[0].data.iterrows():
    body = message["Body"]
    if body is None:
        print('!!! Detected an email with an empty Body field...')
        continue
    try:
        # Drop apostrophes, replace every other non-word character with a
        # space, lower-case the text, then tokenize it.
        text = re.sub(r'[^\w]', ' ', body.replace("'", "")).lower()
        count = 0
        for token in nltk.tokenize.word_tokenize(text):
            try:
                stem = st.stem(token)
            except Exception:
                # Show the token that could not be stemmed and skip it, so a
                # stale stem from the previous token is never counted again.
                print(token)
                continue
            if stem in targets:
                count += 1
        if count:
            records.append({
                "MessageId": message_id,
                "Date": message["Date"],
                "From": message["From"],
                "In-Reply-To": message["In-Reply-To"],
                "Count": count,
            })
    except Exception:
        # Best effort: one malformed message must not abort the whole scan.
        # Unlike a bare `except:`, this still lets KeyboardInterrupt through.
        print('error')

df = pd.DataFrame(
    records,
    columns=["MessageId", "Date", "From", "In-Reply-To", "Count"],
)
In [18]:
# Preview the first five rows of the per-message counts for the tracked word.
df.head(5)
Out[18]:
Group the dataframe by the month and year, and aggregate the counts for the checkword during each month to get a quick histogram of how frequently that word has been used over time.
In [19]:
# Monthly usage of `checkword`: add up the per-message counts for every
# (year, month) pair and plot the totals over time.
# - The aggregation is given as the string "sum"; passing np.sum to agg is
#   deprecated in recent pandas releases and yields the same result.
# - The two grouping keys are renamed so the resulting index levels are
#   "year" and "month" instead of two levels that are both called "Date".
monthly_counts = (
    df.groupby([df.Date.dt.year.rename("year"), df.Date.dt.month.rename("month")])
    .agg({"Count": "sum"})
)

# Label the figure so it can be read on its own when the notebook is skimmed.
ax = monthly_counts.plot(y="Count")
ax.set(
    title='Occurrences of "{}" per month'.format(checkword),
    xlabel="(year, month)",
    ylabel="occurrences",
);
In [ ]:
In [ ]: